# load the config file
yaml.file <- yaml.load_file('config_ongoing_run.yaml')
meta <- read.table(yaml.file$METAFILE, sep='\t', header=T)
#Argument Parser
RDATA_DMT <- params$param1
CONDI <- params$param2
RDATA_PREP_FIGS <- params$param3
PLOT_RDATA <- params$param4
OUTPUT_PATH <- params$param5
control.cond <- unlist(strsplit(CONDI, "_"))[1]
treat.cond <- unlist(strsplit(CONDI, "_"))[2]
mark <- unlist(strsplit(CONDI, "_"))[3]
LEVEL <- yaml.file$LEVEL
CUSTOM_ANNOT <- yaml.file$CUSTOM_ANNOT
# colors - pinkish
#full.color = "#de0000"
#high.color = "#ff8092"
#mid.color = "#f5e0e8"
#low.color = "#996d9a"
#unmeth.color = "#0a1661"
#hyper.color = "#de0000"
#hypo.color = "#0a1661"
#not.color = "#f5e0e8"
# moins rose
full.color = "#de0000"
high.color = "#e17e65"
mid.color = "#c6c6c6"
low.color = "#7c6fac"
unmeth.color = "#0f208f"
hyper.color = "#de0000"
hypo.color = "#0f208f"
not.color = "#c6c6c6"
# couleurs Elouan
#full.color = "#342A1F",
#high.color = "#C59434",
#mid.color = "#6F7498",
#low.color = "#A3B7F9",
#unmeth.color = "#F4D4D4"
titre <- paste0("Differential Methylation Analysis on ", LEVEL, " (", CONDI, ")")
MKit_differential.Rmd : Developped & Maintained by Olivier Kirsh and Elouan Bethuel.
Perform a differential methylation analysis between pair of conditions.
Launch as Rscript on HPC cluster with the workflow : WGBSflow.
Requires 2 RData environments produced by the following R scripts :
- MKit_diff_bed.R which performs MethylKit DM CpG (DMC) or Tiles (DMT) analysis,
- MKit_diff_fig.R which builds all dataframes (wide and long format) needed to build the figures.
#load packages
library(ggplot2)
library(dbplyr)
library(magrittr)
library(methylKit)
library(yaml)
library(stringr)
load(RDATA_DMT)
load(RDATA_PREP_FIGS)
# extract the information from the yaml file
MINCOV <- yaml.file$MINCOV
MINQUALI <- yaml.file$MINQUALI
UNITE <- yaml.file$UNITE
TILESIZE <- yaml.file$TILESIZE
STEPSIZE <- yaml.file$STEPSIZE
SIGNIDIF <- yaml.file$SIGNIDIF
QVALUE <- yaml.file$QVALUE
DESTRAND <- yaml.file$DESTRAND
merge_annot <- yaml.file$MERGE_WITH_BASICS_ANNOT
Analysis on CpG or Tiles - LEVEL : CpG
Merge reads from both strands? DESTRAND : TRUE
Minimum coverage - MINCOV : 5
Discards the bases that have more than COV.PERCth percentile of coverage - COV.PERC : 99.9
Type of unification (all or at least one per group) - UNITE : all
Threshold for significant differential methylation in % : 20
Qvalue for significant differential methylation: 0.05
head(dm_condi, n = 20)
## [[1]]
## methylRaw object with 68609 rows
## --------------
## chr start end strand coverage numCs numTs
## 1 chr19 3050120 3050120 - 1 1 0
## 2 chr19 3051793 3051793 - 1 1 0
## 3 chr19 3051860 3051860 + 1 0 1
## 4 chr19 3051902 3051902 + 1 1 0
## 5 chr19 3051907 3051907 + 1 1 0
## 6 chr19 3051940 3051940 + 1 0 1
## --------------
## sample.id: WT_1
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[2]]
## methylRaw object with 83572 rows
## --------------
## chr start end strand coverage numCs numTs
## 1 chr19 3050119 3050119 + 1 0 1
## 2 chr19 3050120 3050120 - 1 0 1
## 3 chr19 3050317 3050317 - 1 0 1
## 4 chr19 3050813 3050813 - 1 0 1
## 5 chr19 3050818 3050818 - 1 0 1
## 6 chr19 3050955 3050955 + 1 1 0
## --------------
## sample.id: DKO_1
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[3]]
## methylRaw object with 76702 rows
## --------------
## chr start end strand coverage numCs numTs
## 1 chr19 3051819 3051819 + 1 0 1
## 2 chr19 3051821 3051821 + 1 1 0
## 3 chr19 3051860 3051860 + 1 0 1
## 4 chr19 3051903 3051903 - 2 0 2
## 5 chr19 3051908 3051908 - 1 0 1
## 6 chr19 3051914 3051914 - 1 0 1
## --------------
## sample.id: WT_2
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[4]]
## methylRaw object with 67543 rows
## --------------
## chr start end strand coverage numCs numTs
## 1 chr19 3051154 3051154 + 2 0 2
## 2 chr19 3051895 3051895 - 1 0 1
## 3 chr19 3051903 3051903 - 2 0 2
## 4 chr19 3051908 3051908 - 2 1 1
## 5 chr19 3051914 3051914 - 2 1 1
## 6 chr19 3052077 3052077 + 1 1 0
## --------------
## sample.id: DKO_2
## assembly: mm39
## context: CpG
## resolution: base
head(f.dm_condi, n = 20)
## [[1]]
## methylRaw object with 37429 rows
## --------------
## chr start end strand coverage numCs numTs
## 14 chr19 3065878 3065878 - 6 0 6
## 34 chr19 3078956 3078956 - 7 5 2
## 35 chr19 3079413 3079413 + 7 6 1
## 36 chr19 3079414 3079414 - 6 5 1
## 40 chr19 3079526 3079526 - 8 7 1
## 41 chr19 3079664 3079664 - 6 3 3
## --------------
## sample.id: WT_1
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[2]]
## methylRaw object with 32249 rows
## --------------
## chr start end strand coverage numCs numTs
## 40 chr19 3078603 3078603 + 9 1 8
## 41 chr19 3078604 3078604 - 5 1 4
## 42 chr19 3078649 3078649 + 7 1 6
## 47 chr19 3078832 3078832 - 6 0 6
## 50 chr19 3078956 3078956 - 7 2 5
## 53 chr19 3079445 3079445 - 6 2 4
## --------------
## sample.id: DKO_1
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[3]]
## methylRaw object with 47842 rows
## --------------
## chr start end strand coverage numCs numTs
## 39 chr19 3078603 3078603 + 7 2 5
## 40 chr19 3078604 3078604 - 5 5 0
## 41 chr19 3078649 3078649 + 5 3 2
## 42 chr19 3078650 3078650 - 9 9 0
## 50 chr19 3078956 3078956 - 6 5 1
## 51 chr19 3079413 3079413 + 7 6 1
## --------------
## sample.id: WT_2
## assembly: mm39
## context: CpG
## resolution: base
##
##
## [[4]]
## methylRaw object with 44619 rows
## --------------
## chr start end strand coverage numCs numTs
## 44 chr19 3078832 3078832 - 5 0 5
## 46 chr19 3078875 3078875 - 5 0 5
## 47 chr19 3078955 3078955 + 5 0 5
## 66 chr19 3080168 3080168 + 8 3 5
## 67 chr19 3080169 3080169 - 5 0 5
## 68 chr19 3080405 3080405 + 6 1 5
## --------------
## sample.id: DKO_2
## assembly: mm39
## context: CpG
## resolution: base
head(methst, n = 20)
## Methstatus condi
## 1 High WT
## 2 Mid WT
## 3 Mid WT
## 4 Mid WT
## 5 Mid WT
## 6 Mid WT
## 7 Full WT
## 8 High WT
## 9 Mid WT
## 10 Mid WT
## 11 Mid WT
## 12 High WT
## 13 Mid WT
## 14 Mid WT
## 15 High WT
## 16 Mid WT
## 17 Mid WT
## 18 High WT
## 19 High WT
## 20 High WT
head(AllDiffdm, n = 20)
## chr start end strand pvalue qvalue meth.diff coverage1
## 1 chr19 3078955 3078955 + 1.784885e-03 2.206729e-03 -60.25641 13
## 2 chr19 3080168 3080168 + 1.014442e-01 1.152308e-01 -26.19048 14
## 3 chr19 3080405 3080405 + 1.108735e-02 1.311989e-02 -40.95238 15
## 4 chr19 3080609 3080609 + 1.260733e-03 1.576794e-03 -49.37343 19
## 5 chr19 3082071 3082071 + 3.526216e-04 4.574229e-04 -52.84091 22
## 6 chr19 3082173 3082173 + 1.174317e-04 1.580269e-04 -58.37438 29
## 7 chr19 3083004 3083004 + 1.763275e-06 2.983851e-06 -86.66667 10
## 8 chr19 3083013 3083013 + 1.126153e-03 1.412938e-03 -58.57143 15
## 9 chr19 3083062 3083062 + 2.617650e-01 2.871652e-01 -21.42857 12
## 10 chr19 3089018 3089018 + 3.028682e-02 3.514409e-02 -35.19737 16
## 11 chr19 3089133 3089133 + 4.333091e-01 4.662384e-01 -11.11111 21
## 12 chr19 3089141 3089141 + 3.760882e-01 4.058690e-01 -11.18012 23
## 13 chr19 3108379 3108379 + 1.939531e-02 2.269161e-02 -35.96491 19
## 14 chr19 3128966 3128966 + 1.303456e-03 1.627205e-03 -42.44032 29
## 15 chr19 3130088 3130088 + 5.801824e-05 8.036371e-05 -58.18182 20
## 16 chr19 3130109 3130109 + 7.788339e-02 8.888793e-02 -32.35294 10
## 17 chr19 3130130 3130130 + 9.861751e-03 1.169118e-02 -42.48366 17
## 18 chr19 3130134 3130134 + 1.963794e-04 2.591131e-04 -58.23529 17
## 19 chr19 3130191 3130191 + 3.300512e-03 4.019663e-03 -39.25926 25
## 20 chr19 3130195 3130195 + 1.132097e-05 1.703015e-05 -60.00000 25
## numCs1 numTs1 coverage2 numCs2 numTs2 Methyl.Ctrl Methyl.Cond Methstatus_Ct
## 1 10 3 12 2 10 0.7692308 0.16666667 High
## 2 6 8 18 3 15 0.4285714 0.16666667 Mid
## 3 9 6 21 4 17 0.6000000 0.19047619 Mid
## 4 13 6 21 4 17 0.6842105 0.19047619 Mid
## 5 13 9 16 1 15 0.5909091 0.06250000 Mid
## 6 19 10 14 1 13 0.6551724 0.07142857 Mid
## 7 10 0 15 2 13 1.0000000 0.13333333 Full
## 8 12 3 14 3 11 0.8000000 0.21428571 High
## 9 6 6 14 4 10 0.5000000 0.28571429 Mid
## 10 9 7 19 4 15 0.5625000 0.21052632 Mid
## 11 14 7 27 15 12 0.6666667 0.55555556 Mid
## 12 19 4 21 15 6 0.8260870 0.71428571 High
## 13 10 9 18 3 15 0.5263158 0.16666667 Mid
## 14 19 10 26 6 20 0.6551724 0.23076923 Mid
## 15 18 2 22 7 15 0.9000000 0.31818182 High
## 16 5 5 17 3 14 0.5000000 0.17647059 Mid
## 17 11 6 18 4 14 0.6470588 0.22222222 Mid
## 18 15 2 20 6 14 0.8823529 0.30000000 High
## 19 20 5 27 11 16 0.8000000 0.40740741 High
## 20 20 5 25 5 20 0.8000000 0.20000000 High
## Methstatus_Cd Diff_expr DM_status
## 1 Low Significant Hypo
## 2 Low non-significant None
## 3 Low Significant Hypo
## 4 Low Significant Hypo
## 5 Low Significant Hypo
## 6 Low Significant Hypo
## 7 Low Significant Hypo
## 8 Low Significant Hypo
## 9 Mid non-significant None
## 10 Low Significant Hypo
## 11 Mid non-significant None
## 12 Mid non-significant None
## 13 Low Significant Hypo
## 14 Low Significant Hypo
## 15 Mid Significant Hypo
## 16 Low non-significant None
## 17 Low Significant Hypo
## 18 Mid Significant Hypo
## 19 Mid Significant Hypo
## 20 Low Significant Hypo
head(dfpc, n = 20)
## chr start end strand Meth_perc Coverage condi Diff_expr
## 1 chr19 3078955 3078955 + 0.7692308 13 WT Significant
## 2 chr19 3080168 3080168 + 0.4285714 14 WT non-significant
## 3 chr19 3080405 3080405 + 0.6000000 15 WT Significant
## 4 chr19 3080609 3080609 + 0.6842105 19 WT Significant
## 5 chr19 3082071 3082071 + 0.5909091 22 WT Significant
## 6 chr19 3082173 3082173 + 0.6551724 29 WT Significant
## 7 chr19 3083004 3083004 + 1.0000000 10 WT Significant
## 8 chr19 3083013 3083013 + 0.8000000 15 WT Significant
## 9 chr19 3083062 3083062 + 0.5000000 12 WT non-significant
## 10 chr19 3089018 3089018 + 0.5625000 16 WT Significant
## 11 chr19 3089133 3089133 + 0.6666667 21 WT non-significant
## 12 chr19 3089141 3089141 + 0.8260870 23 WT non-significant
## 13 chr19 3108379 3108379 + 0.5263158 19 WT Significant
## 14 chr19 3128966 3128966 + 0.6551724 29 WT Significant
## 15 chr19 3130088 3130088 + 0.9000000 20 WT Significant
## 16 chr19 3130109 3130109 + 0.5000000 10 WT non-significant
## 17 chr19 3130130 3130130 + 0.6470588 17 WT Significant
## 18 chr19 3130134 3130134 + 0.8823529 17 WT Significant
## 19 chr19 3130191 3130191 + 0.8000000 25 WT Significant
## 20 chr19 3130195 3130195 + 0.8000000 25 WT Significant
## DM_status Methstatus Methstatus_Ct Methstatus_Cd
## 1 Hypo High High Low
## 2 None Mid Mid Low
## 3 Hypo Mid Mid Low
## 4 Hypo Mid Mid Low
## 5 Hypo Mid Mid Low
## 6 Hypo Mid Mid Low
## 7 Hypo Full Full Low
## 8 Hypo High High Low
## 9 None Mid Mid Mid
## 10 Hypo Mid Mid Low
## 11 None Mid Mid Mid
## 12 None High High Mid
## 13 Hypo Mid Mid Low
## 14 Hypo Mid Mid Low
## 15 Hypo High High Mid
## 16 None Mid Mid Low
## 17 Hypo Mid Mid Low
## 18 Hypo High High Mid
## 19 Hypo High High Mid
## 20 Hypo High High Low
head(dfpc_annotated, n = 20)
## GRanges object with 20 ranges and 9 metadata columns:
## seqnames ranges strand | Meth_perc Coverage condi Diff_expr
## <Rle> <IRanges> <Rle> | <numeric> <numeric> <factor> <factor>
## [1] chr19 3078955 + | 0.769231 13 WT Significant
## [2] chr19 3080168 + | 0.428571 14 WT non-significant
## [3] chr19 3080405 + | 0.600000 15 WT Significant
## [4] chr19 3080609 + | 0.684211 19 WT Significant
## [5] chr19 3082071 + | 0.590909 22 WT Significant
## ... ... ... ... . ... ... ... ...
## [16] chr19 3128966 + | 0.655172 29 WT Significant
## [17] chr19 3130088 + | 0.900000 20 WT Significant
## [18] chr19 3130088 + | 0.900000 20 WT Significant
## [19] chr19 3130109 + | 0.500000 10 WT non-significant
## [20] chr19 3130109 + | 0.500000 10 WT non-significant
## DM_status Methstatus Methstatus_Ct Methstatus_Cd annot
## <factor> <factor> <factor> <factor> <GRanges>
## [1] Hypo High High Low chr19:1-3103070
## [2] None Mid Mid Low chr19:1-3103070
## [3] Hypo Mid Mid Low chr19:1-3103070
## [4] Hypo Mid Mid Low chr19:1-3103070
## [5] Hypo Mid Mid Low chr19:1-3103070
## ... ... ... ... ... ...
## [16] Hypo Mid Mid Low chr19:3125886-3195867:-
## [17] Hypo High High Mid chr19:3103071-3247732:-
## [18] Hypo High High Mid chr19:3125886-3195867:-
## [19] None Mid Mid Low chr19:3103071-3247732:-
## [20] None Mid Mid Low chr19:3125886-3195867:-
## -------
## seqinfo: 26 sequences from mm39 genome
#################################################################
# Methylation & coverage Statistics on raw data (cpg or tiles)
dfpc_dm <- NULL
groupe <- NULL
for(i in 1:length(dm_condi)){
print(paste0(LEVEL, " raw data"))
print("sample ID")
print(dm_condi[[i]]@sample.id)
groupe <- c(groupe, rep(strsplit(dm_condi[[i]]@sample.id, "_")[[1]][1], dim(dm_condi[[i]])[1]) )
dm_condi[[i]] %>% getData() %>%
dplyr::mutate(., Meth_perc = 100 * (numCs/coverage)) %>%
dplyr::mutate(., ID = dm_condi[[i]]@sample.id) -> df_dm
dfpc_dm <- rbind(dfpc_dm, df_dm)
rm(df_dm)
}
dfpc_dm <- cbind(dfpc_dm, groupe)
#Plot of CpG Methylation on raw data (before filtering coverage and unite)
p_dm <- ggplot(dfpc_dm, aes(x=Meth_perc, fill=groupe, color=groupe)) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("% CpG Methylation on raw data ", "(", LEVEL, ")")) +
facet_grid(. ~ ID ) +
labs(x = "% of methylation", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
p_dm
rm(groupe)
cov_dm <- ggplot(dfpc_dm, aes(x=coverage, fill=groupe, color=groupe)) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 200, 10)) +
ggtitle(paste0("Coverage on raw data ", "(", LEVEL, ")")) +
facet_grid(. ~ ID ) +
labs(x = "Coverage", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
cov_dm
rm(dfpc_dm)
#################################################################
# Methylation & coverage Statistics on filtered data
dfpc_fdm <- NULL
groupe <- NULL
for(i in 1:length(f.dm_condi)){
print("f.dm_condi, filtered data")
print("sample ID")
print(f.dm_condi[[i]]@sample.id)
groupe <- c(groupe, rep(strsplit(f.dm_condi[[i]]@sample.id, "_")[[1]][1], dim(f.dm_condi[[i]])[1]) )
f.dm_condi[[i]] %>% getData() %>%
dplyr::mutate(., Meth_perc = 100 * (numCs/coverage)) %>%
dplyr::mutate(., ID = f.dm_condi[[i]]@sample.id) -> df_fdm
dfpc_fdm <- rbind(dfpc_fdm, df_fdm)
rm(df_fdm)
}
dfpc_fdm <- cbind(dfpc_fdm, groupe)
#Plot of CpG Methylation on filtered data
p_fdm <- ggplot(dfpc_fdm, aes(x=Meth_perc, fill=groupe, color=groupe)) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("% CpG Methylation on filtered data ", "(", LEVEL, ")")) +
facet_grid(. ~ ID ) +
labs(x = "% of methylation", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
p_fdm
rm(groupe)
cov_fdm <- ggplot(dfpc_fdm, aes(x=coverage, fill=groupe, color=groupe)) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("Coverage on filtered data ", "(", LEVEL, ")")) +
facet_grid(. ~ ID) +
labs(x = "Coverage", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
cov_fdm
rm(dfpc_fdm)
For each condition, the plot shows the number of CpGs at different methylation status.
The methylation level is categorised into five classes:
- Full : 100% methylated
- High : between 75 and 100% excluded
- Mid : between 50 et 75% exluded
- Low : between 0 exluded and 25% excluded
- Unmeth : unmethylated
#################################################################
#Barplot Methylation Status bpst object
#requires methst object
bpst <- ggplot(data=methst ,
aes( x = condi, fill = factor( Methstatus, rev( levels(Methstatus) ) ) )
) +
geom_bar() +
labs(fill = '') +
scale_fill_manual("Methylation status : ",
values = c( 'Full' = full.color,
'High' = high.color,
'Mid' = mid.color,
'Low' = low.color,
'Unmeth' = unmeth.color)) +
ggtitle(paste0(CONDI, " Methylation status") ) +
theme_bw(base_size = 14) +
theme(axis.title.x=element_blank(),
plot.title = element_text(hjust = 0.5))
bpst
These plots give the proportion of differentially methylated tiles or CpG present in the three categories:
- Hyper: DKO > WT by more than 20 % and qvalue < 0.05
- Hypo: DKO < WT by more than 20 % and qvalue < 0.05
- None: no significant difference, qvalue > 0.05
The significance threshold of the methylation difference (20 %) is modifiable in the configuration file
as well as the threshold on the qvalue (0.05).
#################################################################
# Piechart Differentially methylated CpG or Tiles distribution
# piedm object on AllDiffdm$DM_status %>% table %>% data.frame
summary(AllDiffdm$DM_status)
## Hyper Hypo None
## 4 11559 1902
piedm <- ggplot(AllDiffdm$DM_status %>% table %>% data.frame,
aes(x="", y=Freq, fill=.)) +
geom_col() +
scale_fill_manual("", values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
coord_polar("y", start=0) +
theme_void() +
ggtitle(paste0(CONDI),
subtitle = paste0(" Differentially methylated ", LEVEL, " distribution (all)")
) +
theme(plot.title = element_text(hjust = 0.5))
piedm
# piedms object on filter(AllDiffdm, DM_status != \"None\") -> cnt
dplyr::filter(AllDiffdm, DM_status != "None") %>% dplyr::select(. , DM_status) %>% table %>% data.frame() -> cnt
piedms <- ggplot( cnt[-3,], aes(x="", y=Freq, fill=.) ) +
geom_col() +
scale_fill_manual("", values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color)
) +
coord_polar("y", start=0) +
theme_void() +
ggtitle(paste0(CONDI),
subtitle = paste0(" Differentially methylated " ,LEVEL, " distribution (Significant)")
) +
theme(plot.title = element_text(hjust = 0.5))
piedms
For each annotation track (promoters, exons …), the plots show the number of differentially methylated CpGs or tiles as a function of the methylation status in both conditions.
#################################################################
# New barplot fraction low mid high in Diff meth on All the Annots
for(i in IGV ){
dfpc_temp <- dfpc_annotated %>% data.frame
dfpc_temp <- dfpc_temp[ which( dfpc_temp$annot.type %in% i ) , ]
exist_annot <- row.names(table(dfpc_temp$annot.type))
if(is.na(match(x = i , exist_annot)) == FALSE){
dfpc_temp$annot.type <- factor(dfpc_temp$annot.type, levels = IGV[i] )
bp2ct <- ggplot( dfpc_temp, aes(x=condi, fill = DM_status)) +
geom_bar( ) +
facet_grid(~Methstatus, drop = FALSE) +
theme_bw(base_size = 14) +
scale_fill_manual("", values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
ggtitle(paste0(label = CONDI, " Differential methylation status"),
subtitle = paste0( "On annotation track ", i,
" and by ", LEVEL, " methylation level in ",
sample.ids[1])
) +
theme(
axis.title.x=element_blank(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic")
)
print(bp2ct)
}
}
rm(dfpc_temp)
#################################################################
#Histogram of methylation all
histmeth <- ggplot(dfpc %>% data.frame,
aes(x=Meth_perc*100, fill= condi, color= condi)
) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("% of methylation, for all unified ", LEVEL)) +
facet_grid(. ~ condi, scales = "free", space = "free" ) +
labs(x = "% of methylation", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
histmeth
#################################################################
#Histogram of methylation Signif
histmeths <- ggplot(dfpc_Sig %>% data.frame,
aes(x=Meth_perc*100, fill= condi, color= condi)
) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("% of methylation, for differentially methylated ", LEVEL)) +
facet_grid(. ~ condi, scales = "free", space = "free" ) +
labs(x = "% of methylation", y = "Counts") +
theme_bw(base_size = 14) +
theme(
legend.title = element_text(),
axis.title.x=element_text(),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
strip.text.x = element_text(size = 12, color = "black", face = "bold.italic"))
histmeths
#################################################################
#Histogram of methylation all by group
histmethg <- ggplot(dfpc %>% data.frame,
aes(x=Meth_perc*100, fill= condi, color= condi)
) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("Methylation distribution on all ", LEVEL, ", by group") ) +
xlab("% of methylation") + ylab("Counts") +
theme_bw(base_size = 14) +
theme(legend.title = element_blank(),
plot.title = element_text(hjust = 0.5))
histmethg
#################################################################
#Histogram of methylation Signif by group
histmethsg <- ggplot(dfpc_Sig %>% data.frame,
aes(x=Meth_perc*100, fill= condi, color=condi)
) +
#scale_fill_manual(values=c("darkorange2", "cornflowerblue", "red", "yellow")) +
geom_histogram(position="identity",
alpha=0.6,
breaks = seq(0, 100, 5)) +
ggtitle(paste0("Methylation distribution on differentially methylated ", LEVEL, ", by group") ) +
xlab("% of methylation") + ylab("Counts") +
theme_bw(base_size = 14) +
theme(legend.title = element_blank(),
plot.title = element_text(hjust = 0.5))
histmethsg
#################################################################
#Violin plot of methylation All
#vp2
vp2 <- ggplot(dfpc, aes(x= condi, y=Meth_perc*100, fill = condi)) +
geom_violin(trim=FALSE) +
geom_boxplot(width=0.05) +
labs(title = paste0("Methylation distribution on all ", LEVEL) ,
x = "Samples", y = "% of methylation" ) +
theme_bw(base_size = 14) +
theme(axis.title.x=element_blank(),
plot.title = element_text(hjust = 0.5),
legend.position = "none")
vp2
#Violin plot of methylation Signif
#vp2s
vps2 <- ggplot(dfpc_Sig, aes(x= condi , y=Meth_perc*100, fill = condi)) +
geom_violin(trim=FALSE) +
geom_boxplot(width=0.05) +
labs( title = paste0("Methylation distribution on differentially methylated ", LEVEL) ,
x = "Samples", y = "Methylation %" ) +
theme_bw(base_size = 14) +
theme(axis.title.x=element_blank(),
plot.title = element_text(hjust = 0.5),
legend.position = "none")
vps2
#################################################################
#Volcano plot
if(min(AllDiffdm$qvalue) == 0){
ymax <- .Machine$double.xmin # if qvalue == 0 , set the ylim max at the smallest representable number in R (2.225074e-308)
}else{
ymax <- min(AllDiffdm$qvalue)
}
vc <- ggplot(data = AllDiffdm,
aes(x = meth.diff, y = -log10(qvalue), col = DM_status)) +
geom_point() +
theme_bw(base_size = 14) +
labs(col = '') +
xlab("Methylation difference (%)") +
xlim(-100, 100) +
ylim(0, -log10(ymax)) +
scale_color_manual(values=c('Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)) +
geom_vline(xintercept = c(-DIF, DIF), linetype="dashed", col="black") +
geom_hline(yintercept = -log10(QV), linetype="dashed", col="black") +
ggtitle(paste0(CONDI, " (", LEVEL, ")")) +
theme(plot.title = element_text(hjust = 0.5))
vc
for(i in IGV ){
Adtdf <- AllDiffdm_annotated %>% data.frame
Adtdf <- Adtdf[ which( Adtdf$annot.type %in% i ) , ]
exist_annot <- row.names(table(Adtdf$annot.type))
if(is.na(match(x = i , exist_annot)) == FALSE){
Adtdf$annot.type <- factor(Adtdf$annot.type, levels = IGV[i] )
vc_temp <- ggplot(data = Adtdf,
aes(x = meth.diff, y = -log10(qvalue), col = DM_status)) +
geom_point() +
theme_bw(base_size = 14) +
labs(col = '') +
xlab("Methylation difference (%)") +
xlim(-100, 100) +
ylim(0, -log10(ymax)) +
scale_color_manual(values=c('Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)) +
geom_vline(xintercept = c(-DIF, DIF), linetype="dashed", col="black") +
geom_hline(yintercept = -log10(QV), linetype="dashed", col="black") +
ggtitle(paste0(CONDI, " (", LEVEL, "), on annotation ", i))
print(vc_temp)
}
}
rm(Adtdf)
rm(exist_annot)
# MH corrected for cases with low number of differences
if (length(AllDiffdm_annotated) > 40){
top_diff <- data.frame(AllDiffdm_annotated[AllDiffdm_annotated$pvalue <= sort(AllDiffdm_annotated$pvalue)[30] , ])
} else {
top_diff <- data.frame(AllDiffdm_annotated[AllDiffdm_annotated$pvalue <= sort(AllDiffdm_annotated$pvalue), ])
}
top_diff <- top_diff[, c(1,2,3,6,20, 28, 30)]
colnames(top_diff) <- c("Chr", "start", "end", "Pval", "Status", "Gene ID", "Annotation")
knitr::kable(top_diff , "pipe")
| Chr | start | end | Pval | Status | Gene ID | Annotation |
|---|---|---|---|---|---|---|
| chr19 | 3323686 | 3323686 | 0 | Hypo | ENSMUSG00000024831.15 | mm39_genes |
| chr19 | 3323686 | 3323686 | 0 | Hypo | NA | mm39_introns |
| chr19 | 3785574 | 3785574 | 0 | Hypo | ENSMUSG00000118231.2 | mm39_near_tss |
| chr19 | 3785574 | 3785574 | 0 | Hypo | ENSMUSG00000118096.2 | mm39_near_tss |
| chr19 | 3785574 | 3785574 | 0 | Hypo | NA | mm39_intergenic |
| chr19 | 3850608 | 3850608 | 0 | Hypo | ENSMUSG00000045098.20 | mm39_genes |
| chr19 | 3850608 | 3850608 | 0 | Hypo | ENSMUSG00000082848.2 | mm39_promoters |
| chr19 | 3850608 | 3850608 | 0 | Hypo | ENSMUSG00000082848.2 | mm39_genes |
| chr19 | 3850608 | 3850608 | 0 | Hypo | NA | mm39_introns |
| chr19 | 3850608 | 3850608 | 0 | Hypo | NA | mm39_introns |
| chr19 | 3880347 | 3880347 | 0 | Hypo | ENSMUSG00000082848.2 | mm39_genes |
| chr19 | 3880347 | 3880347 | 0 | Hypo | NA | mm39_introns |
| chr19 | 4193611 | 4193611 | 0 | Hypo | ENSMUSG00000024842.9 | mm39_genes |
| chr19 | 4193611 | 4193611 | 0 | Hypo | ENSMUSG00000044724.10 | mm39_genes |
| chr19 | 4193611 | 4193611 | 0 | Hypo | NA | mm39_introns |
| chr19 | 4193611 | 4193611 | 0 | Hypo | ENSMUSG00000044724.10 | mm39_exons |
| chr19 | 4193611 | 4193611 | 0 | Hypo | ENSMUSG00000024842.9 | mm39_near_tss |
| chr19 | 4211352 | 4211352 | 0 | Hypo | ENSMUSG00000024830.18 | mm39_genes |
| chr19 | 4211352 | 4211352 | 0 | Hypo | NA | mm39_cpg_shores |
| chr19 | 4211352 | 4211352 | 0 | Hypo | NA | mm39_introns |
| chr19 | 4288368 | 4288368 | 0 | Hypo | NA | mm39_intergenic |
| chr19 | 4465828 | 4465828 | 0 | Hypo | NA | mm39_intergenic |
| chr19 | 4753520 | 4753520 | 0 | Hypo | ENSMUSG00000071691.12 | mm39_near_tss |
| chr19 | 4753520 | 4753520 | 0 | Hypo | NA | mm39_intergenic |
| chr19 | 4985269 | 4985269 | 0 | Hypo | ENSMUSG00000024901.16 | mm39_genes |
| chr19 | 4985269 | 4985269 | 0 | Hypo | NA | mm39_introns |
| chr19 | 4987726 | 4987726 | 0 | Hypo | ENSMUSG00000024901.16 | mm39_genes |
| chr19 | 4987726 | 4987726 | 0 | Hypo | NA | mm39_introns |
| chr19 | 5402650 | 5402650 | 0 | Hypo | ENSMUSG00000024846.7 | mm39_near_tss |
| chr19 | 5402650 | 5402650 | 0 | Hypo | NA | mm39_intergenic |
| chr19 | 5402650 | 5402650 | 0 | Hypo | NA | mm39_cpg_shelves |
rm(top_diff)
if(LEVEL == "Tiles"){
df_alldiff <- data.frame(AllDiffdm_annotated)
sort_alldif_annot <- df_alldiff[order(df_alldiff$pvalue, decreasing=FALSE),]
sort_annotated_genes <- sort_alldif_annot[ sort_alldif_annot$annot.type == paste0(ORG, "_genes"),]
background <- sort_annotated_genes[!duplicated(sort_annotated_genes[, "start"]),]
write.csv(background, file= paste0(OUTPUT_PATH , CONDI, "_background_tiles.csv"))
sort_sig_annotated_genes <- sort_annotated_genes[sort_annotated_genes$Diff_expr == "Significant", ]
write.csv(sort_sig_annotated_genes, file= paste0(OUTPUT_PATH , CONDI, "_significant_annot_tiles.csv"))
print(table(sort_sig_annotated_genes$DM_status))
}
Show the differential methylation status :
- Hyper
- Hypo
- None
On basic genomic annotation tracks.
#################################################################
#plot categorical
dm_annotated <- AllDiffdm_annotated %>% data.frame
dm_annotated$annot.type <- factor(dm_annotated$annot.type , levels = IGV )
anno_re <- ggplot(data=dm_annotated, aes(x=annot.type, fill=DM_status)) +
geom_bar(position = "fill") +
scale_fill_manual("",
values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
scale_x_discrete("", labels = gsub("ORG_", "", IGV ) ) +
theme_bw(base_size = 14) +
ggtitle( paste0('DM status on annotation tracks (relative counts)'),
subtitle = paste0(CONDI) ) +
theme(axis.text.x = element_text( size = 12, angle = 45, vjust = 1, hjust = 1),
axis.title.x = element_blank(), legend.title = element_blank()
)
anno_re
anno_abs <- ggplot(data=dm_annotated, aes(x=annot.type, fill=DM_status)) +
geom_bar() +
scale_fill_manual("",
values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
scale_x_discrete("", labels = gsub("ORG_", "", IGV) ) +
theme_bw(base_size = 14) +
ggtitle( paste0('DM status on annotation tracks (absolute counts)'),
subtitle = paste0(CONDI) ) +
theme(axis.text.x = element_text( size = 8, angle = 45, vjust = 1, hjust = 1),
axis.title.x = element_blank(), legend.title = element_blank()
)
anno_abs
if((CUSTOM_ANNOT == TRUE) && (merge_annot == FALSE)){
library(GenomicRanges)
library(annotatr)
df_custom_tracks <- data.frame(list_custom_tracks[[1]])
for(i in 2:length(list_custom_tracks)){
df <- data.frame(list_custom_tracks[[i]])
df_custom_tracks <- rbind(df_custom_tracks, df)
}
GR_custom_tracks <- makeGRangesFromDataFrame(df_custom_tracks, keep.extra.columns = TRUE)
for(i in 1:length(unique(meta_annot$group))){
print(i)
AllDiffdm_annotated <- annotate_regions(regions = AllDiffdm_regions,
annotations = GR_custom_tracks[GR_custom_tracks$group == unique(meta_annot$group)[i] ],
ignore.strand = TRUE,
quiet = TRUE)
dm_annotated <- AllDiffdm_annotated %>% data.frame
dm_annotated$annot.type <- factor(dm_annotated$annot.type )
anno_re_custom <- ggplot(data=dm_annotated, aes(x=annot.type, fill=DM_status)) +
geom_bar(position = "fill") +
scale_fill_manual("",
values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
scale_x_discrete("", labels = gsub("ORG_", "", dm_annotated$annot.type ) ) +
theme_bw(base_size = 14) +
ggtitle( paste0('DM status on annotation tracks (relative counts)'),
subtitle = paste0(CONDI) ) +
theme(axis.text.x = element_text( size = 8, angle = 45, vjust = 1, hjust = 1),
axis.title.x = element_blank(), legend.title = element_blank()
)
print(anno_re_custom)
anno_abs_custom <- ggplot(data=dm_annotated, aes(x=annot.type, fill=DM_status)) +
geom_bar() +
scale_fill_manual("",
values = c( 'Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)
) +
scale_x_discrete("", labels = gsub("ORG_", "", dm_annotated$annot.type) ) +
theme_bw(base_size = 14) +
ggtitle( paste0('DM status on annotation tracks (absolute counts)'),
subtitle = paste0(CONDI) ) +
theme(axis.text.x = element_text( size = 8, angle = 45, vjust = 1, hjust = 1),
axis.title.x = element_blank(), legend.title = element_blank()
)
print(anno_abs_custom)
}
rm(df_custom_tracks)
rm(GR_custom_tracks)
rm(dm_annotated)
rm(AllDiffdm_annotated)
}
For each chromosome show the number of DMCs/DMTs hypo or hyper-methylated
The category named “others” correspond to chromsomes with unconventional names (like Unmapped chromsomes for example)
if( dim(dfpc_Sig)[1] > 0){
count_per_chromosome <- table(dfpc_Sig$chr, dfpc_Sig$DM_status)
count_df <- as.data.frame.matrix(count_per_chromosome)
count_df$chr <- rownames(count_df)
# test si absence de hypo ou hyper
if(dim(count_df)[2] == 2){
if(colnames(count_df[1]) == "hypo"){
count_df$Hyper <- rep(0, dim(count_df)[1])
}else{
count_df$Hypo <- rep(0, dim(count_df)[1])
}
}
chr_others <- count_df[1,]
rownames(chr_others) <- "others"
chr_others$Hypo <- chr_others$Hyper <- 1
chr_others$chr <- "others"
delete_row <- 0
for(i in 1:length(count_df$chr)){
if((nchar(count_df$chr[i]) > 5) == TRUE){
chr_others$Hypo <- (chr_others$Hypo + count_df[i,]$Hypo)
chr_others$Hyper <- (chr_others$Hyper + count_df[i,]$Hyper)
delete_row <- c(delete_row, i)
}
}
if(is.na(delete_row[2]) == FALSE){ # test si il n'y a aucune colonne à remove (delete_row = 0), car sinon génère des bugs
count_df <- count_df[-delete_row,]
}
count_df<- rbind(count_df, chr_others)
count_df$chr <- str_sort(count_df$chr, numeric = TRUE)
count_df_long <- tidyr::gather(count_df, key = "status", value = "count", Hypo, Hyper)
hist_dmr_chr <- ggplot(count_df_long, aes(x = chr, y = count, fill = status)) +
geom_bar(position = "dodge" , stat = "identity", colour = "black") +
labs(title = paste0("Number of differentially methylated ", LEVEL, " by chromosome"),
x = "Chromosome",
y = LEVEL) +
theme_bw(base_size = 14) +
scale_fill_manual("", values=c('Hyper' = hyper.color,
'Hypo' = hypo.color,
'None' = not.color)) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
rm(delete_row)
rm(count_df, count_df_long)
rm(chr_others)
hist_dmr_chr
}
Saves the whole R session of the script (variables, objects …)
at the end of its execution. The RData file produced can then be opened in a local environment (if you have enough space)
to make modifications. For example to retouch figures.
save.image(file = PLOT_RDATA)
# R Session
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Debian GNU/Linux 11 (bullseye)
##
## Matrix products: default
## BLAS/LAPACK: /opt/conda/envs/wgbsflow/lib/libopenblasp-r0.3.21.so
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] stringr_1.5.0 methylKit_1.20.0 GenomicRanges_1.46.1
## [4] GenomeInfoDb_1.30.1 IRanges_2.28.0 S4Vectors_0.32.4
## [7] BiocGenerics_0.40.0 magrittr_2.0.3 dbplyr_2.3.0
## [10] ggplot2_3.3.6 yaml_2.3.5
##
## loaded via a namespace (and not attached):
## [1] MatrixGenerics_1.6.0 Biobase_2.54.0
## [3] tidyr_1.3.0 sass_0.4.5
## [5] jsonlite_1.8.4 splines_4.1.3
## [7] R.utils_2.12.2 gtools_3.9.4
## [9] bslib_0.4.2 assertthat_0.2.1
## [11] highr_0.10 GenomeInfoDbData_1.2.7
## [13] Rsamtools_2.10.0 numDeriv_2016.8-1.1
## [15] pillar_1.8.1 lattice_0.20-45
## [17] glue_1.6.2 limma_3.50.3
## [19] bbmle_1.0.25 digest_0.6.31
## [21] XVector_0.34.0 qvalue_2.26.0
## [23] colorspace_2.1-0 htmltools_0.5.4
## [25] Matrix_1.5-3 R.oo_1.25.0
## [27] plyr_1.8.8 XML_3.99-0.11
## [29] pkgconfig_2.0.3 emdbook_1.3.12
## [31] zlibbioc_1.40.0 purrr_1.0.1
## [33] mvtnorm_1.1-3 scales_1.2.1
## [35] BiocParallel_1.28.3 tibble_3.1.8
## [37] farver_2.1.1 generics_0.1.3
## [39] SummarizedExperiment_1.24.0 cachem_1.0.6
## [41] withr_2.5.0 cli_3.6.0
## [43] crayon_1.5.2 mclust_6.0.0
## [45] evaluate_0.20 R.methodsS3_1.8.2
## [47] fansi_1.0.4 MASS_7.3-58.2
## [49] tools_4.1.3 data.table_1.14.2
## [51] matrixStats_0.63.0 BiocIO_1.4.0
## [53] lifecycle_1.0.3 munsell_0.5.0
## [55] DelayedArray_0.20.0 Biostrings_2.62.0
## [57] compiler_4.1.3 jquerylib_0.1.4
## [59] fastseg_1.40.0 rlang_1.0.6
## [61] grid_4.1.3 RCurl_1.98-1.10
## [63] rjson_0.2.21 labeling_0.4.2
## [65] bitops_1.0-7 rmarkdown_2.20
## [67] restfulr_0.0.15 gtable_0.3.1
## [69] DBI_1.1.3 reshape2_1.4.4
## [71] R6_2.5.1 GenomicAlignments_1.30.0
## [73] knitr_1.42 dplyr_1.0.10
## [75] rtracklayer_1.54.0 bdsmatrix_1.3-6
## [77] fastmap_1.1.0 utf8_1.2.3
## [79] stringi_1.7.12 parallel_4.1.3
## [81] Rcpp_1.0.10 vctrs_0.5.2
## [83] tidyselect_1.2.0 xfun_0.37
## [85] coda_0.19-4